I downloaded manually (argh!) all the files you can find in the data folder (in this GitHub repo), from the website http://www.vivc.de, specifically all the SSR (microsatellite markers) info available and the passport data (that is, color of the berry and country of origin of the variety) for all the cultivars

If someone out there would be so kind to help me in fetching data on this website in an automated and recursive way, with any of the wonderful R or Python libraries available, it would be of great help. The problem is that to download data from vivc.de one needs to generate an Excel table selecting a certain number of varieties and it is not possible to get all the info at once

Apart from that, I did the job for you (it did not take too long actually =) ) so here are the data

Data preparation

First load some libraries

library(prettydoc); library(kableExtra); library(data.table); library(knitr); library(xlsx); library(readxl); library(dplyr); library(ggplot2); library(poppr); library(ggsci); library(forcats); library(scales)
length(dir('data/DB_complete/', recursive = T))
## [1] 129
length(dir('data/SSR_data/', recursive = T))
## [1] 206
head(dir('data/DB_complete/', recursive = T))
## [1] "grid-export100.xlsx" "grid-export101.xlsx" "grid-export102.xlsx"
## [4] "grid-export103.xlsx" "grid-export104.xlsx" "grid-export105.xlsx"
head(dir('data/SSR_data/', recursive = T))
## [1] "1001-1020.xlsx" "101-120.xlsx"   "1021-1040.xlsx" "1041-1060.xlsx"
## [5] "1061-1080.xlsx" "1081-1100.xlsx"

Let’s load the complete database, listing all the files with ‘.xlsx’ in the DB_complete subfolder and applying the read.xlsx function to this list

vivc.list <- list.files(path = 'data/DB_complete/',
                        pattern = '.xlsx',
                        full.names = T,
                        recursive = T)

length(vivc.list)
## [1] 129
vivc.files <- lapply(vivc.list, function(x) read.xlsx(x, sheetName  = 'Worksheet'))

names(vivc.files) <- vivc.list

length(vivc.files)
## [1] 129

Let’s check the structure of a single file

Prime.name Color.of.berry.skin Variety.number.VIVC Country.of.origin.of.the.variety
ROMANINA BLANC 17263 PORTUGAL
ROMANTRA ROUGE 16074 NA
ROMBOLA NOIR 10179 GREECE
ROME BLANC 16035 SPAIN
ROME NOIR 10181 SPAIN
ROMEIKO NOIR 10182 GREECE
Prime.name Color.of.berry.skin Variety.number.VIVC Country.of.origin.of.the.variety
183 NOIR 41139 NA
A LA REINE BLANC 1 NA
ABAAJICHE BLANC 19069 GEORGIA
ABACA BLANC 15528 SPAIN
ABADESA BLANC 15529 SPAIN
ABADI NA 24154 ALGERIA

Since all the files have the same structure we can call rbind on the list

vivc.all <- do.call('rbind', vivc.files)

kable(head(vivc.all), align = 'l', row.names = F)
Prime.name Color.of.berry.skin Variety.number.VIVC Country.of.origin.of.the.variety
ROMANINA BLANC 17263 PORTUGAL
ROMANTRA ROUGE 16074 NA
ROMBOLA NOIR 10179 GREECE
ROME BLANC 16035 SPAIN
ROME NOIR 10181 SPAIN
ROMEIKO NOIR 10182 GREECE
dim(vivc.all)
## [1] 12862     4
rownames(vivc.all) <- c(1:12862)

DT::datatable(vivc.all, rownames = F, filter = "top")

Check if there are duplicated entries

dim(vivc.all[duplicated(vivc.all), ])
## [1] 0 4

There are no duplicated entries. Let’s plot some statistics about berry color

ggplot(vivc.all, aes(Color.of.berry.skin)) +
  geom_bar() +
  labs(x="Color of berry skin", y="Count") +
  theme_bw()

And some about country of origin

ggplot(vivc.all, aes(fct_infreq(Country.of.origin.of.the.variety))) +
  geom_bar() +
  labs(x="Country of origin", y="Count") +
  theme_bw() +
  theme(axis.text.x=element_text(angle=90,hjust=1, size = 4))

Italy rules over France!!

At this stage I want to load the SSR info and merge them with the entire database. Same steps as before

ssr.list <- list.files(path = 'data/SSR_data/',
                       pattern = '.xlsx',
                       full.names = T)

ssr.files <- lapply(ssr.list, function(x) read.xlsx(x, sheetName  = 'Worksheet'))

names(ssr.files) <- ssr.list

length(ssr.files)
## [1] 206

Let’s check the structure of a single file

kable(ssr.files[[1]][1:6,1:8], align = 'l', row.names = F)
Reference.variety VVS2A1 VVS2A2 VVMD5A1 VVMD5A2 VVMD7A1 VVMD7A2 VVMD25A1
AGLIANICO 151 155 234 248 239 239 249
ARAGATSI 151 155 236 236 239 243 245
CORVINA VERONESE 151 155 234 242 239 239 241
FOSTER’S WHITE SEEDLING 151 155 230 238 247 249 241
HRVATICA 151 153 228 240 239 247 241
KANDAHARI SAFID 151 155 236 242 239 253 245
kable(ssr.files[[206]][1:6,1:8], align = 'l', row.names = F)
Reference.variety VVS2A1 VVS2A2 VVMD5A1 VVMD5A2 VVMD7A1 VVMD7A2 VVMD25A1
ABOUHOU 151 151 238 242 249 249 249
BOUZOUGA 151 151 240 242 249 253 255
JUBILAEUMSREBE 151 151 234 242 243 247 249
RANNII VIRA 151 151 236 240 249 249 249
RATINO 151 151 234 240 243 263 NA
RUSHAKI 151 151 236 240 239 247 249
# same structure over all files, call rbind

ssr.all <- do.call('rbind', ssr.files)

kable(ssr.all[1:6,1:8], align = 'l', row.names = F)
Reference.variety VVS2A1 VVS2A2 VVMD5A1 VVMD5A2 VVMD7A1 VVMD7A2 VVMD25A1
AGLIANICO 151 155 234 248 239 239 249
ARAGATSI 151 155 236 236 239 243 245
CORVINA VERONESE 151 155 234 242 239 239 241
FOSTER’S WHITE SEEDLING 151 155 230 238 247 249 241
HRVATICA 151 153 228 240 239 247 241
KANDAHARI SAFID 151 155 236 242 239 253 245
dim(ssr.all)
## [1] 3688   19
str(ssr.all)
## 'data.frame':    3688 obs. of  19 variables:
##  $ Reference.variety: Factor w/ 3619 levels "AGLIANICO","ARAGATSI",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ VVS2A1           : num  151 151 151 151 151 151 151 151 151 151 ...
##  $ VVS2A2           : num  155 155 155 155 153 155 155 155 155 155 ...
##  $ VVMD5A1          : num  234 236 234 230 228 236 242 228 228 236 ...
##  $ VVMD5A2          : num  248 236 242 238 240 242 242 238 238 236 ...
##  $ VVMD7A1          : num  239 239 239 247 239 239 243 247 247 249 ...
##  $ VVMD7A2          : num  239 243 239 249 247 253 253 257 257 253 ...
##  $ VVMD25A1         : num  249 245 241 241 241 245 245 241 241 245 ...
##  $ VVMD25A2         : num  263 249 263 255 263 249 249 249 249 249 ...
##  $ VVMD27A1         : num  184 182 180 186 180 195 180 186 186 182 ...
##  $ VVMD27A2         : num  190 195 190 186 180 195 195 190 190 195 ...
##  $ VVMD28A1         : num  228 244 258 244 254 218 234 228 228 218 ...
##  $ VVMD28A2         : num  258 244 258 260 278 234 236 236 236 246 ...
##  $ VVMD32A1         : num  250 250 240 252 250 250 256 272 272 250 ...
##  $ VVMD32A2         : chr  "256" "272" "272" "272" ...
##  $ VrZAG62A1        : num  188 186 188 194 188 188 188 194 194 188 ...
##  $ VrZAG62A2        : num  188 194 194 202 204 188 188 194 194 202 ...
##  $ VrZAG79A1        : num  245 247 251 237 251 257 247 239 239 247 ...
##  $ VrZAG79A2        : num  247 247 251 239 251 259 257 245 245 251 ...

There’s a problem with the column VVMD32A2, a single value is saved as two allele size. Since I don’t know which one is correct, I replace it with missing value

ssr.all$VVMD32A2 <- gsub('272/274', 'NA', ssr.all$VVMD32A2)

ssr.all$VVMD32A2 <- as.numeric(ssr.all$VVMD32A2)
## Warning: si è prodotto un NA per coercizione
rownames(ssr.all) <- c(1:3688)

DT::datatable(ssr.all, rownames = F, filter = 'top')

Check if there are duplicated cultivars

dim(ssr.all[duplicated(ssr.all), ])
## [1] 49 19
sum(duplicated(ssr.all$Reference.variety))
## [1] 69
#kable(head(ssr.all[duplicated(ssr.all[,1]),]), align='l', row.names = F)

#ssr.all[duplicated(ssr.all$Reference.variety),]

There are some duplicated names. I want to keep all of them and give different names to the ones with same name. Later I will explore if same name means also same SSR profile

rownames(ssr.all) = make.names(names = ssr.all$Reference.variety, unique=TRUE)

#ssr.all[duplicated(ssr.all$Reference.variety),] %>%
#  .[order(as.character(.$Reference.variety)),] %>%
#  kable(., align='l', row.names = T)

Now the duplicated have different names, and I replace the old names with the new ones with different numbers corresponding to more synonyms

Reference.variety VVS2A1 VVS2A2 VVMD5A1 VVMD5A2 VVMD7A1 VVMD7A2 VVMD25A1
AGLIANICO 151 155 234 248 239 239 249
ARAGATSI 151 155 236 236 239 243 245
CORVINA.VERONESE 151 155 234 242 239 239 241
FOSTER.S.WHITE.SEEDLING 151 155 230 238 247 249 241
HRVATICA 151 153 228 240 239 247 241
KANDAHARI.SAFID 151 155 236 242 239 253 245

Now I can merge the complete db with the SSR file. Before I check if there are duplicated names and change the names of the complete db with the same method as before, so to uniform it with the SSR file

sum(duplicated(vivc.all$Prime.name))
## [1] 86

There are 86 entries with the same name. I create unique names adding a number to the duplicated entries

rownames(vivc.all) = make.names(names = vivc.all$Prime.name, unique=TRUE)

vivc.all$Prime.name <- rownames(vivc.all)

kable(vivc.all[1:6,], align = 'l', row.names = F)
Prime.name Color.of.berry.skin Variety.number.VIVC Country.of.origin.of.the.variety
ROMANINA BLANC 17263 PORTUGAL
ROMANTRA ROUGE 16074 NA
ROMBOLA NOIR 10179 GREECE
ROME BLANC 16035 SPAIN
ROME.1 NOIR 10181 SPAIN
ROMEIKO NOIR 10182 GREECE
sum(duplicated(vivc.all$Prime.name))
## [1] 0

Now I can merge the two files, keeping only the entries in common

ssr.with.info <- merge(ssr.all, vivc.all, by.x='Reference.variety', by.y='Prime.name', all=F)

dim(ssr.with.info)
## [1] 3221   22

In this way we exclude around 400 cultivars with no complete info. Let’s double check also for duplicated entries

kable(ssr.with.info[1:6,c(1:2,19:22)], align = 'l', row.names = F)
Reference.variety VVS2A1 VrZAG79A2 Color.of.berry.skin Variety.number.VIVC Country.of.origin.of.the.variety
ABADI 133 NA NA 24154 ALGERIA
ABBOU 137 NA NOIR 141 MOROCCO
ABBUOTO 135 251 NOIR 7 ITALY
ABDERAZAK.BEN.HAMAMA 133 NA NA 24155 ALGERIA
ABERKANE 137 259 NOIR 13 ALGERIA
ABJOUCH 151 257 BLANC 17 AFGHANISTAN
sum(duplicated(ssr.with.info))
## [1] 0
sum(duplicated(ssr.with.info$Reference.variety))
## [1] 0

First steps with adegenet functions within the poppr package (‘poppr’ in CRAN)

Before converting the data frame to a genind object that will be used in adegenet, I want to check how many unique country of origins exist, since they will be used as populations for the next analysis. The idea is to group different countries within the same ‘population’. To this aim I manually checked all the countries and tried to identify major populations

kable(table(ssr.with.info$Country.of.origin.of.the.variety), align = 'l', row.names = F)
Var1 Freq
AUSTRIA 41
BULGARIA 54
EGYPT 2
FRANCE 457
GERMANY 120
GREECE 98
HUNGARY 164
IRAN 24
ITALY 655
JAPAN 1
MOLDOVA 27
PORTUGAL 238
RUSSIAN FEDERATION 31
SLOVAKIA 9
SOUTH AFRICA 15
SPAIN 244
SWITZERLAND 26
TUNISIA 71
TURKEY 52
UNITED STATES OF AMERICA 41
AFGHANISTAN 16
ALBANIA 13
ARGENTINA 35
ARMENIA 26
AUSTRALIA 12
BELGIUM 3
BRAZIL 2
CHILE 0
CHINA 11
CROATIA 45
CZECH REPUBLIC 15
DAGHESTAN 37
GEORGIA 30
MEXICO 0
ROMANIA 45
SERBIA 22
SUN (FORMERLY USSR, UNION OF SOVIET SOCIALIST REPUBLICS) 21
UKRAINE 69
UNITED KINGDOM 11
YUGOSLAVIA 20
ALGERIA 33
AZERBEIJAN 27
IRAQ 1
TAJIKISTAN 18
UZBEKISTAN 52
SYRIAN ARAB REPUBLIC 8
KYRGYZSTAN 0
TURKMENISTAN 15
ISRAEL 28
LEBANON 6
MOROCCO 33
CYPRUS 7
INDIA 2
BALKAN 6
MONTENEGRO 3
SLOVENIA 20
YEMEN 4
MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF 6
PALESTINIAN TERRITORY, OCCUPIED 1
DALMATIEN 3
KAZAKHSTAN 1
NETHERLANDS 2
CANADA 0
EUROPE 0
BOSNIA AND HERZEGOWINA 4
JORDAN 2
ASIA MINOR 2
MALTA 7
COSTA RICA 0
(FORMERLY CZECHOSLOVAKIA) 0
PERU 1
length(table(ssr.with.info$Country.of.origin.of.the.variety))
## [1] 71

The number of distinct countries is 71. Outside R, after long hours of meditation, I created a new country coding scheme, that is found in the data folder

new.country.codes <- read.table('data/new_countries_code.txt',
                                col.names = c('old', 'new'),
                                header = F,
                                sep='\t')

kable(new.country.codes, align = 'l', row.names = F)
old new
(FORMERLY CZECHOSLOVAKIA) CE
AFGHANISTAN STAN
ALBANIA EE
ALGERIA NoA
ARGENTINA SUSA
ARMENIA ASIA
ASIA MINOR ASIA
AUSTRALIA AUS
AUSTRIA CE
AZERBEIJAN STAN
BALKAN EE
BELGIUM NE
BOSNIA AND HERZEGOWINA EE
BRAZIL SUSA
BULGARIA EE
CANADA CAN
CHILE SUSA
CHINA ASIA
COSTA RICA SUSA
CROATIA EE
CYPRUS ASIA
CZECH REPUBLIC CE
DAGHESTAN STAN
DALMATIEN EE
EGYPT NoA
EUROPE CE
FRANCE WE
GEORGIA ASIA
GERMANY CE
GREECE EE
HUNGARY EE
INDIA ASIA
IRAN ASIA
IRAQ ASIA
ISRAEL ASIA
ITALY SE
JAPAN ASIA
JORDAN ASIA
KAZAKHSTAN STAN
KYRGYZSTAN STAN
LEBANON ASIA
MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF EE
MALTA EE
MEXICO SUSA
MOLDOVA EE
MONTENEGRO EE
MOROCCO NoA
NETHERLANDS CE
PALESTINIAN TERRITORY, OCCUPIED ASIA
PERU SUSA
PORTUGAL WE
ROMANIA EE
RUSSIAN FEDERATION EE
SERBIA EE
SLOVAKIA EE
SLOVENIA EE
SOUTH AFRICA SA
SPAIN WE
SUN (FORMERLY USSR, UNION OF SOVIET SOCIALIST REPUBLICS) ASIA
SWITZERLAND CE
SYRIAN ARAB REPUBLIC ASIA
TAJIKISTAN STAN
TUNISIA NoA
TURKEY ASIA
TURKMENISTAN STAN
UKRAINE EE
UNITED KINGDOM NE
UNITED STATES OF AMERICA USA
UZBEKISTAN STAN
YEMEN ASIA
YUGOSLAVIA EE
length(table(new.country.codes$new))
## [1] 13

There are now 13 unique ‘populations’. If you want to know what ‘STAN’ stands for, check this code conversion table

Table Header Second Header
CE Center Europe
EE Eastern Europe
NE North Europe
NoA North Africa
SA South Africa
SE Southern Europe
STAN All the countries of middle-east which ends in ‘stan’ (except AZERBAIJAN)
SUSA South and Center America
WE Western Europe

Now we merge the new code file with the SSR data frame to have the new 13 country codes all together with the cultivars info

ssr.complete <- merge(ssr.with.info, new.country.codes,
                      by.x = 'Country.of.origin.of.the.variety',
                      by.y = 'old',
                      all = T)

kable(ssr.complete[1:6, c(1:4,23)], align = 'l', row.names = F)
Country.of.origin.of.the.variety Reference.variety VVS2A1 VVS2A2 new
AUSTRIA ZWEIGELTREBE.BLAU 137 143 CE
AUSTRIA VELTLINER.BRAUN 137 151 CE
AUSTRIA KOELNER.BLAU 133 153 CE
AUSTRIA RASTIGNIER 135 143 CE
AUSTRIA LAGLER.WEISS 133 151 CE
AUSTRIA HAINER.GRUEN 133 143 CE
dim(ssr.complete)
## [1] 3228   23

Using the option all = T with the merge command created some entries with NA in the names. Let’s remove them. We also need to remove the factors that are not present anymore after removing these entries

levels <- levels(ssr.complete$Country.of.origin.of.the.variety)

levels[length(levels) + 1] <- "Unknown"

# refactor cultivars to include "Unknown" as a factor level
# and replace NA with "Unknown"
ssr.complete$Country.of.origin.of.the.variety <- factor(ssr.complete$Country.of.origin.of.the.variety, levels = levels)

ssr.complete$Country.of.origin.of.the.variety[is.na(ssr.complete$Country.of.origin.of.the.variety)] <- "Unknown"

levels <- levels(ssr.complete$new)

levels[length(levels) + 1] <- "Unk"

# refactor cultivars to include "Unknown" as a factor level
# and replace NA with "Unknown"
ssr.complete$new <- factor(ssr.complete$new, levels = levels)

ssr.complete$new[is.na(ssr.complete$new)] <- "Unk"

ssr.complete <- ssr.complete[!is.na(ssr.complete$Reference.variety), ]

sum(is.na(ssr.complete$Reference.variety))
## [1] 0
kable(ssr.complete[1:6, c(1:4,23)], align = 'l', row.names = F)
Country.of.origin.of.the.variety Reference.variety VVS2A1 VVS2A2 new
AUSTRIA ZWEIGELTREBE.BLAU 137 143 CE
AUSTRIA VELTLINER.BRAUN 137 151 CE
AUSTRIA KOELNER.BLAU 133 153 CE
AUSTRIA RASTIGNIER 135 143 CE
AUSTRIA LAGLER.WEISS 133 151 CE
AUSTRIA HAINER.GRUEN 133 143 CE
ssr.complete$Country.of.origin.of.the.variety <- droplevels(ssr.complete$Country.of.origin.of.the.variety)

ssr.complete$new <- droplevels(ssr.complete$new)

Since the data frame is formatted with two columns for each locus (one allele each column), we first need to convert it to the format: one column each locus, with the two alleles separated by a slash (/). Then we will use the function df2genind from adegenet to convert the df to a genind object

There are 9 SSR in the database; since grapevine is diploid each SSR locus has two alleles, so 18 columns in total. We create 9 new columns where we paste the content of each adjacent columns (being the df organized in this way, that is the two alleles of the same locus in adjacent columns)

col.to.add <- 24:32

ssr.complete[, col.to.add] <- (sapply(seq(from = 3,
                                          to = 20,
                                          by=2),
                                      function(i) paste0(ssr.complete[,i],'/',
                                                         ssr.complete[,i+1]))
                               )

ssr.df.for.adegenet <- ssr.complete[,c(2,23:32)]

names(ssr.df.for.adegenet)[2] <- 'pop'

names(ssr.df.for.adegenet)[3:11] <- c('VVSA2', 'VVMD5', 'VVMD7', 'VVMD25', 'VVMD27', 'VVMD28', 'VVMD32', 'VrZAG62', 'VrZAG79') 

Now the new df is ready to be converted to a genind object

ssr.complete.genind <- df2genind(ssr.df.for.adegenet[,3:11],
                                 ploidy=2,
                                 sep='/', 
                                 pop = ssr.df.for.adegenet$pop, 
                                 ind.names = ssr.df.for.adegenet$Reference.variety, 
                                 loc.names = names(ssr.df.for.adegenet)[3:11]
                                 )

ssr.complete.genind
## /// GENIND OBJECT /////////
## 
##  // 3,221 individuals; 9 loci; 212 alleles; size: 2.9 Mb
## 
##  // Basic content
##    @tab:  3221 x 212 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 18-35)
##    @loc.fac: locus factor for the 212 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: df2genind(X = ssr.df.for.adegenet[, 3:11], sep = "/", ind.names = ssr.df.for.adegenet$Reference.variety, 
##     loc.names = names(ssr.df.for.adegenet)[3:11], pop = ssr.df.for.adegenet$pop, 
##     ploidy = 2)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 12-939)
length(unique(pop(ssr.complete.genind)))
## [1] 13
#gac <- genotype_curve(ssr.complete.genind, sample = 1000, quiet = TRUE)

The main aim of my exploration is to find out if 9 microsatellite markers are enough to identify geographic clusters in the more than 3000 cultivars. Actually, we already have a geographic classification (13 groups), so we will compare the one we already have with the one performed by the function find.clusters from the adegenet package. Before running the command with n.pca and n.clust parameters, it is recommended to run it without and choose these 2 values interactively. I did this before and come with the following. As first thing I try to match the 13 a priori groups with 13 picked by the function

length(unique(pop(ssr.complete.genind)))
## [1] 13
grp <- find.clusters(ssr.complete.genind, 
                     max.n.clust=100, 
                     n.pca = 250,
                     n.clust = 12)

table(pop(ssr.complete.genind), grp$grp)
##       
##          1   2   3   4   5   6   7   8   9  10  11  12
##   ASIA  19  42  10   1  15  11   4  56  19  14  26   9
##   AUS    0   0   0   0   1   1   0   9   0   0   1   0
##   CE     9   0  13  27  13  28  78  16   5  12   1   2
##   EE    51  53  34  12  43  75  15  86  80 116  23  58
##   NE     0   0   0   1   4   0   0   6   3   0   0   0
##   NoA   34   0   3   1   6   2   0  75   2   1   9   6
##   SA     1   0   0   0   3   0   0   9   2   0   0   0
##   SE    61  57 143  36  82  64  13  47  50  44  19  39
##   STAN   0   1   7   0  16  12   3  32  17   4  63  11
##   SUSA   8   8   0   0   3   0   0   1  13   0   5   0
##   USA    2   2   0   1  10   0   1   3   4   1  14   3
##   WE   127  36  74  87  26  51  99 174  33  64  46 122
##   Unk    7   8   3   6   9   8  10  29  16  15   6   9
table.value(table(pop(ssr.complete.genind), grp$grp), col.lab=paste("Inf", 1:12),
            row.lab=levels(ssr.complete$new))

Not a really clear overlapping between the two classifications. Let’s try to plot the DAPC object after calling the function dapc

# evaluate this interactively before
dapc <- dapc(ssr.complete.genind,
             n.pca=100,
             n.da=3,
             pop = grp$grp)

scatter(dapc)

# dapc.a.priori <- dapc(ssr.complete.genind,
#                       n.pca=100,
#                       n.da=3,
#                       pop = pop(ssr.complete.genind))
# 
# scatter(dapc.a.priori)

Since we are not very satisfied with the results, we can try the new method implemented in adegenet via the function snapclust. We will apply this function to the same genind object, but first we will try to identify the optimal number of clusters using the function snapclust.choose.k.

vivc.aic <- snapclust.choose.k(40, ssr.complete.genind)

plot(vivc.aic, type = "b", cex = 2, xlab = "k", ylab = "AIC")

points(which.min(vivc.aic), min(vivc.aic), col = "blue", pch = 20, cex = 2)

abline(v = 35, lty = 2, col = "red")

Repeat the same steps with BIC values instead

vivc.bic <- snapclust.choose.k(40, ssr.complete.genind, IC = BIC)

plot(vivc.bic, type = "b", cex = 2, xlab = "k", ylab = "BIC")

points(which.min(vivc.bic), min(vivc.bic), col = "blue", pch = 20, cex = 2)

abline(v = 12, lty = 2, col = "red")

From the BIC evaluations it looks like there is a good improvement in finding a clear minimun value that can help in defining a correct number of clusters. From raw values, the minimun is when K=7, but K={8,9,10} is also probable. Considering that from my human opinable classification I defined 12 groups, this smaller number of cluster can help refine my classification. Let’s do some plotting

# we impose K=7 as first attempt
vivc.clust <- snapclust(ssr.complete.genind, k = 7)

head(vivc.clust$group, 7)
## ZWEIGELTREBE.BLAU   VELTLINER.BRAUN      KOELNER.BLAU        RASTIGNIER 
##                 1                 2                 3                 4 
##      LAGLER.WEISS      HAINER.GRUEN           ROESLER 
##                 1                 3                 3 
## Levels: 1 2 3 4 5 6 7
length(vivc.clust$group)
## [1] 3221
head(round(vivc.clust$proba),3)
##                   1 2 3 4 5 6 7
## ZWEIGELTREBE.BLAU 1 0 0 0 0 0 0
## VELTLINER.BRAUN   0 1 0 0 0 0 0
## KOELNER.BLAU      0 0 1 0 0 0 0
vivc.clust$converged
## [1] TRUE
a.tab <- table(pop(ssr.complete.genind), vivc.clust$group)

a.tab
##       
##          1   2   3   4   5   6   7
##   ASIA   4  58  26  18  17  54  49
##   AUS    0   9   0   1   0   0   2
##   CE   131  16  31  14   9   1   2
##   EE    34  88 253  78  33  86  74
##   NE     2   7   0   5   0   0   0
##   NoA    1  75   4   1  34   0  24
##   SA     0   9   0   5   1   0   0
##   SE   102  55 174 139  63  78  44
##   STAN   3  34  20   6   0   4  99
##   SUSA   0   3   0   9   8  14   4
##   USA    3   4   0   9   2   4  19
##   WE   266 180  56  57 131  39 210
##   Unk   21  30  19  24   5  12  15
table.value(a.tab, col.labels = 1:7)

# some colorful image
compoplot(vivc.clust)

# dapc again
seven.dapc <- dapc(ssr.complete.genind, 
                   n.pca = 100, 
                   n.da = 3, 
                   grp = vivc.clust$group)

scatter(seven.dapc, clab = 0.85, col = funky(24),
        posi.da="topleft", posi.pca = "bottomleft",
        scree.pca = TRUE, grp = vivc.clust$group)